Here we show the code to reproduce the analyses of: Risso and Pagnotta (2020). Per-sample standardization and asymmetric winsorization lead to accurate classification of RNA-seq expression profiles. In preparation.

This file belongs to the repository: https://github.com/drisso/awst_analysis.

The code is released with license GPL v3.0.

Install and load awst

if (!requireNamespace("BiocManager", quietly = TRUE))
    install.packages("BiocManager")

BiocManager::install("drisso/awst")

The synthetic dataset

… We then created \(k=5\) groups of samples, each made of \(M=30\) replicate samples, randomly selected (with replacement) from the original set of 80. For each group, we randomly selected (without replacement) \(J=500\) genes, whose expression was altered according to the following multiplicative model. \[ \tilde{y}_j = y_j \cdot (0.001 + r_j), \quad j=1,\ldots,J, \] where \(y_j\) denotes the observed expression of gene \(j\), \(\tilde{y}\) denotes the perturbed expression, and \(r_j\) is the realization of a Gamma random variable with shape parameter \(a=0.5\) and scale parameter \(s=1\).

#BiocManager::install("seqc") # uncomment if necessary
rm(list = ls())
set.seed(20200413)
library(seqc)
  
ddata <- get("ILM_aceview_gene_MAY")
ddata <- ddata[!is.na(ddata$EntrezID),]
ddata <- ddata[!ddata$IsERCC,]
  
feature_annotation <- data.frame(ddata[, 1:3], row.names = ddata$EntrezID)
dim(ddata <- as.matrix(ddata[, -c(1:4)]))    #[1] 19701   384
sum(duplicated(feature_annotation$Symbol))   #[1] 0
sum(duplicated(feature_annotation$EntrezID)) #[1] 0
row.names(ddata) <- feature_annotation$EntrezID
  
dim(sample_annotation.df <- data.frame(row.names = colnames(ddata), sample = colnames(ddata))) #[1] 384   1
sample_annotation.df$lane <- substr(sample_annotation.df$sample, 6, 7)
sample_annotation.df$flow_cell <- substr(sample_annotation.df$sample, 17, 17)
sample_annotation.df$sample <- substr(sample_annotation.df$sample, 1, 3)
  
sample_annotation.df <- sample_annotation.df[grep("A", sample_annotation.df$sample),]
dim(ddata <- ddata[, rownames(sample_annotation.df)]) #[1] 19701    80
  
M <- 30
nnumbers <- c("01", "02", "03", "04", "05", "06", "07", "08", "09", paste(10:M))
tmp <- c(paste0("A", nnumbers), paste0("B", nnumbers), paste0("C", nnumbers), 
         paste0("D", nnumbers), paste0("E", nnumbers))
  
design.df <- data.frame(row.names = tmp, sample = tmp, original.sample = NA) 
design.df$original.sample[1:M]           <- sample(rownames(sample_annotation.df), M, replace = TRUE)
design.df$original.sample[(M+1):(2*M)]   <- sample(rownames(sample_annotation.df), M, replace = TRUE)
design.df$original.sample[(2*M+1):(3*M)] <- sample(rownames(sample_annotation.df), M, replace = TRUE)
design.df$original.sample[(3*M+1):(4*M)] <- sample(rownames(sample_annotation.df), M, replace = TRUE)
design.df$original.sample[(4*M+1):(5*M)] <- sample(rownames(sample_annotation.df), M, replace = TRUE)
#  table(design.df$original.sample)
  
k <- "A"
wwhich <- grep(k, design.df$sample)
synthetic_data <- ddata[, design.df$original.sample[wwhich]]
colnames(synthetic_data) <- design.df$sample[wwhich]
  
the_genes <- rownames(synthetic_data)
no_of_altered_genes <- 500

genes_to_alterate <- sample(the_genes, no_of_altered_genes, replace = FALSE)
the_genes <- the_genes[-which(the_genes %in% genes_to_alterate)]

for(jj in 1:M) {
  tmp <- 0.001 + rgamma(length(genes_to_alterate), shape = 0.5, scale = 1)
  synthetic_data[genes_to_alterate, jj] <- synthetic_data[genes_to_alterate, jj] * tmp
}
  
#  k <- "B"
for(k in c("B", "C", "D", "E")) {
  wwhich <- grep(k, design.df$sample)
  tmp_data <- ddata[, design.df$original.sample[wwhich]]
  colnames(tmp_data) <- design.df$sample[wwhich]
    
  genes_to_alterate <- sample(the_genes, no_of_altered_genes, replace = FALSE)
  the_genes <- the_genes[-which(the_genes %in% genes_to_alterate)]

  for(jj in 1:M) {
    tmp <- 0.001 + rgamma(length(genes_to_alterate), shape = 0.5, scale = 1)
    tmp_data[genes_to_alterate, jj] <- tmp_data[genes_to_alterate, jj] * tmp
  }
  
    
  synthetic_data <- cbind(synthetic_data, tmp_data)
}
dim(ddata <- floor(synthetic_data)) #[1] 19701   150
  
annotation.df <- data.frame(samples = colnames(ddata), row.names = colnames(ddata))
annotation.df$sample <- substr(annotation.df$samples, 1, 1)
annotation.df$sample.col <- factor(annotation.df$sample)
levels(annotation.df$sample.col) <- clust.col <- c("gold", "red", "green2", "blue", "cyan")
names(clust.col) <- unique(annotation.df$sample)
   
save(ddata, annotation.df, feature_annotation, clust.col, file = "synthetic20200413.RData")
  
#tmp <- cbind(annotation.df, t(ddata))
#write.table(tmp, file = "synthetic20200413.tsv", sep = "\t", quote = FALSE, row.names = FALSE)
#write.table(feature_annotation, file = "synthetic20200413_features_annotation.tsv", sep = "\t", quote = FALSE, row.names = FALSE)

Figure 2

Performance of different data pre-processing paired with the hierarchical clustering (Euclidean distance, Ward’s linkage). a) AWST data pre-processing; b) Hart’s data pre-processing; c) Radovich’s data pre-processing; d) TCGA data pre-processing. The “sample” bar indicates the true partition, and the “clust” bar indicates the inferred partition obtained by cutting the tree to get 5 clusters. The Calinski-Harabasz curve is superimposed in each panel.

Supplementary Figure 1

(Extended Figure 2) Performance of different data pre-processing paired with the hierarchical clustering (Euclidean distance, Ward’s linkage). a) AWST data pre-processing; b) Hart’s data pre-processing; c) Radovich’s data pre-processing; d) TCGA data pre-processing; e) FPKM pre-processing with top 2,500 features according to standard-deviation; f) FPKM pre-processing with top 5,000 features according to standard-deviation. The “sample” bar indicates the true partition, and the “clust” bar indicates the inferred partition obtained by cutting the tree to get 5 clusters. The Calinski-Harabasz curve is superimposed in each panel.

Supplementary Figure 2

Study for the synthetic data of the effects of data pre-processing on the compactness of groups respect to the theoretical partition and Euclidean distance. top/left) AWST; top/right) Hart; mid/left) Radovich; mid/right) TCGA; bottom/left) FPKM pre-processing with top 2500 features according to standard-deviation; bottom/right) FPKM pre-processing with top 5000 features according to standard-deviation.

Figure 3

Performance of different data pre-processing paired with ConsensusClusterPlus (inner and outer average linkage with Pearson’s correlation as distance matrix). top/left) AWST data pre-processing; top/right) Hart’s data pre-processing; bottom/left) Radovich’s protocol; bottom/right) TCGA data protocol; bottom/left) - Le figure sono nello stesso ordine di Figure 2

Supplementary Figure 3

(Extended Figure 3) Performance of different data pre-processing paired with ConsensusClusterPlus (inner and outer average linkage with Pearson’s correlation as distance matrix). top/left) AWST data pre-processing; top/right) Hart’s data pre-processing; mid/left) Radovich’s protocol; mid/right) TCGA data protocol; bottom/left) FPKM pre-processing with top 2{,}500 features according to standard-deviation; bottom/right) FPKM pre-processing with top 5{,}000 features according to standard-deviation. The “sample’’ bar indicates the true partition, and the”consensus’’ bar indicates the inferred partition obtained by requiring 5 clusters.

Supplementary Figure 4

Performance of different data pre-processing paired with ConsensusClusterPlus (Euclidean distance and PAM method). top/left) AWST data pre-processing; top/right) Hart’s data pre-processing; mid/left) Radovich’s pre-processing; mid/right) TCGA data pre-processing; bottom/left) FPKM pre-processing with top 2{,}500 features according to standard-deviation; bottom/right) FPKM pre-processing with top 5{,}000 features according to standard-deviation. The “sample’’ bar indicates the true partition, and the”consensus’’ bar indicates the inferred partition obtained by requiring 5 clusters.

AWST procedure

hclust (Euclidean/Ward)

## adjusted-Rand index (clues): 0.9832

ConsensusClusterPlus (default parameters)

Result of ConsensuClusterPlus with default setup
(innerLinkage=“average”, finalLinkage=“average”, distance=“pearson”)

## adjusted-Rand index (clues): 1

ConsensusClusterPlus (PAM)

Result of ConsensuClusterPlus with clusterAlg = “pam” and distance = “euclidean” (other parameters left to default)

## adjusted-Rand index (clues): 1

Hart 2013

Finding the active genes in deep RNA-seq gene expression studies

hclust (Euclidean/Ward)

## adjusted-Rand index (clues): 0.6779

ConsensusClusterPlus (default parameters)

Result of ConsensuClusterPlus with default setup
(innerLinkage=“average”, finalLinkage=“average”, distance=“pearson”)

## adjusted-Rand index (clues): 0.6952

ConsensusClusterPlus (PAM)

Result of ConsensuClusterPlus with clusterAlg = “pam” and distance = “euclidean” (other parameters left to default)

## adjusted-Rand index (clues): 1

Radovich (2018) procedure

The Integrated Genomic Landscape of Thymic Epithelial Tumors

hclust (Euclidean/Ward)

## adjusted-Rand index (clues): 0.0848

ConsensusClusterPlus (default parameters)

Result of ConsensuClusterPlus with default setup
(innerLinkage=“average”, finalLinkage=“average”, distance=“pearson”)

## adjusted-Rand index (clues): 1

ConsensusClusterPlus (PAM)

Result of ConsensuClusterPlus with clusterAlg = “pam” and distance = “euclidean” (other parameters left to default)

## adjusted-Rand index (clues): 0.1212

TCGA (2015) procedure

Comprehensive, Integrative Genomic Analysis of Diffuse Lower-Grade Gliomas - Supplementary Appendix (see pages 23-24)

hclust (Euclidean/Ward)

## adjusted-Rand index (clues): 0.0013

ConsensusClusterPlus (default parameters)

Result of ConsensuClusterPlus with default setup
(innerLinkage=“average”, finalLinkage=“average”, distance=“pearson”)

## adjusted-Rand index (clues): 0.0947

ConsensusClusterPlus (PAM)

Result of ConsensuClusterPlus with clusterAlg = “pam” and distance = “euclidean” (other parameters left to default)

## adjusted-Rand index (clues): 0.0013

FPKM: Top 2500 features

## adjusted-Rand index (clues): 0.4637

ConsensusClusterPlus (default parameters)

Result of ConsensuClusterPlus with default setup
(innerLinkage=“average”, finalLinkage=“average”, distance=“pearson”)

## adjusted-Rand index (clues): 0.5208

ConsensusClusterPlus (PAM)

Result of ConsensuClusterPlus with clusterAlg = “pam” and distance = “euclidean” (other parameters left to default)

## adjusted-Rand index (clues): 1

FPKM: Top 5000 features

## adjusted-Rand index (clues): 0.0013

ConsensusClusterPlus (default parameters)

Result of ConsensuClusterPlus with default setup
(innerLinkage=“average”, finalLinkage=“average”, distance=“pearson”)

## adjusted-Rand index (clues): 0.1812

ConsensusClusterPlus (PAM)

Result of ConsensuClusterPlus with clusterAlg = “pam” and distance = “euclidean” (other parameters left to default)

## adjusted-Rand index (clues): 0.6112

Adjusted Rand indeces

AWST Hart Radovich TCGA FPKM2500 FPKM5K
HC 0.9832 0.6779 0.0848 0.0013 0.4637 0.0013
CCP1 1.0000 0.6952 1.0000 0.0947 0.5208 0.1812
CCP2 1.0000 1.0000 0.1212 0.0013 1.0000 0.6112

HC) hirerachical clustering with Ward’s likage and Euclidean distance; CPP1) ConsensusClusterPlus with average innner and outer linkage, and Pearson’s correlation as distance; CCP2) ConsensusClusterPlus with PAM and Euclidean distance;

Session info

## R version 3.4.4 (2018-03-15)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Linux Mint 19.1
## 
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=it_IT.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=it_IT.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=it_IT.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=it_IT.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats4    parallel  stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] EDASeq_2.10.0              ShortRead_1.34.2          
##  [3] GenomicAlignments_1.12.2   SummarizedExperiment_1.6.5
##  [5] DelayedArray_0.2.7         matrixStats_0.56.0        
##  [7] Rsamtools_1.28.0           GenomicRanges_1.28.6      
##  [9] GenomeInfoDb_1.12.3        Biostrings_2.44.2         
## [11] XVector_0.16.0             IRanges_2.10.5            
## [13] S4Vectors_0.14.7           BiocParallel_1.10.1       
## [15] Biobase_2.36.2             BiocGenerics_0.22.1       
## [17] heatmap3_1.1.7             steFunctions_2019.04.29   
## [19] awst_0.0.4                 clues_0.6.2.2             
## [21] dendextend_1.13.4          cluster_2.0.6             
## [23] knitr_1.28                
## 
## loaded via a namespace (and not attached):
##  [1] bitops_1.0-6            bit64_0.9-7             doParallel_1.0.15      
##  [4] RColorBrewer_1.1-2      tools_3.4.4             R6_2.4.1               
##  [7] DBI_1.1.0               colorspace_1.4-1        tidyselect_1.0.0       
## [10] gridExtra_2.3           bit_1.1-15.2            compiler_3.4.4         
## [13] rtracklayer_1.36.6      scales_1.1.0            genefilter_1.58.1      
## [16] DESeq_1.28.0            stringr_1.4.0           digest_0.6.25          
## [19] rmarkdown_2.1           R.utils_2.9.2           pkgconfig_2.0.3        
## [22] htmltools_0.4.0         highr_0.8               rlang_0.4.5            
## [25] RSQLite_2.2.0           hwriter_1.3.2           dplyr_0.8.5            
## [28] R.oo_1.23.0             RCurl_1.98-1.1          magrittr_1.5           
## [31] GenomeInfoDbData_0.99.0 Matrix_1.2-18           Rcpp_1.0.4             
## [34] munsell_0.5.0           viridis_0.5.1           lifecycle_0.2.0        
## [37] R.methodsS3_1.8.0       stringi_1.4.6           yaml_2.2.1             
## [40] zlibbioc_1.22.0         grid_3.4.4              blob_1.2.1             
## [43] crayon_1.3.4            lattice_0.20-35         splines_3.4.4          
## [46] GenomicFeatures_1.28.5  annotate_1.54.0         pillar_1.4.3           
## [49] fastcluster_1.1.25      geneplotter_1.54.0      codetools_0.2-15       
## [52] biomaRt_2.32.1          XML_3.99-0.3            glue_1.3.2             
## [55] evaluate_0.14           latticeExtra_0.6-28     vctrs_0.2.4            
## [58] foreach_1.4.8           gtable_0.3.0            purrr_0.3.3            
## [61] assertthat_0.2.1        ggplot2_3.3.0           xfun_0.12              
## [64] aroma.light_3.6.0       xtable_1.8-4            survival_3.1-12        
## [67] viridisLite_0.3.0       tibble_2.1.3            iterators_1.0.12       
## [70] AnnotationDbi_1.38.2    memoise_1.1.0